{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/device:GPU:0\n"
     ]
    }
   ],
   "source": [
    "import collections\n",
    "from collections import defaultdict\n",
    "import os\n",
    "import random\n",
    "import tarfile\n",
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "from tensorflow.keras import preprocessing\n",
    "from tensorflow.keras import layers\n",
    "from tensorflow.keras.preprocessing.text import text_to_word_sequence, one_hot, Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "import numpy as np\n",
    "import sys\n",
    "import time\n",
    "import os\n",
    "sys.path.append(\"..\")\n",
    "import d2lzh_tensorflow2 as d2l\n",
    "print(tf.test.gpu_device_name())\n",
    "DATA_ROOT = \"../../data\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torchtext.vocab as Vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "AUTO = tf.data.experimental.AUTOTUNE\n",
    "physical_devices = tf.config.list_physical_devices('GPU')\n",
    "tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.1.0\n"
     ]
    }
   ],
   "source": [
    "print(tf.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10.7.1. 文本情感分类数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 10.7.1.1. 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#数据放入data目录下，代码解压速度较慢，如果不想用代码解压，也可直接手动解压，跳过这一步\n",
    "import os\n",
    "path = os.getcwd()#返回当前进程的工作目录\n",
    "a_path = os.path.abspath(os.path.join(path, \"../../data/aclImdb_v1.tar.gz\"))\n",
    "with tarfile.open(a_path, 'r') as f:\n",
    "    f.extractall(DATA_ROOT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 本函数已保存在d2lzh_tensorflow2包中方便以后使用\n",
    "def read_imdb(folder='train', data_root=\"../../data/aclImdb/\"):\n",
    "    data = []\n",
    "    for label in ['pos', 'neg']:\n",
    "        folder_name = os.path.join(data_root, folder, label)\n",
    "        for file in (os.listdir(folder_name)):\n",
    "            with open(os.path.join(folder_name, file), 'rb') as f:\n",
    "                review = f.read().decode('utf-8').replace('\\n', '').lower()\n",
    "                data.append([review, 1 if label == 'pos' else 0])\n",
    "    random.shuffle(data)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data, test_data = read_imdb('train'), read_imdb('test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['no, this hilariously horrible 70\\'s made-for-tv horror clinker isn\\'t about a deadly demonically possessed dessert cake. still, this exceptionally awful, yet undeniably amusing and thus enjoyable cathode ray refuse reaches a breathtaking apex of absolute, unremitting silliness and atrociousness that\\'s quite tasty in a so-execrable-it\\'s-downright-awesome sort of way. richard crenna, looking haggard and possibly inebriated, and yvette mimieux, who acts as if she never got over the brutal rape she endured in \"jackson county jail,\" sluggishly portray a disgustingly nice and respectable suburbanite couple whose quaint, dull, sleepy small town existence gets ripped asunder when the cute german shepard they take in as the family pet turns out to be some ancient lethal evil spirit. pretty soon mimieux and her two repellently cutesy kids kim richards and ike eisenmann (the psychic alien moppets from the disney \"witch mountain\" pictures) are worshiping a crude crayon drawing of the nasty, ugly canine entity in the den. boy, now doesn\\'t that sound really scary and disturbing? well, scary and disturbing this laughably ludicrous claptrap sure ain\\'t, but it sure is funny, thanks to curtis (\"night tide\") harrington\\'s hopelessly weak direction, cartoonish (not so) special effects, an almost painfully risible\\'n\\'ridiculous plot, and a game cast that struggles valiantly with the absurd story (besides the leads, both martine beswicke and r.g. armstrong briefly pop up as members of a satanic cult and victor jory has a nice cameo as a helpful native american shaman). favorite scene: the malicious mephestophelion mutt puts the whammy on crenna, practically forcing him to stick his hand into a wildly spinning lawnmower blade. while stuck-up snobby fright film fans may hold their noses at the perfectly putrid stench of this admittedly smelly schlock, devout tv trash lovers should deem this endearingly abominable offal the boob tube equivalent to alpo.',\n",
       " 1]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#每条数据中包含一个字符串评论和一个标签\n",
    "train_data[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 10.7.1.2. 预处理数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "我们需要对每条评论做分词，从而得到分好词的评论。这里定义的get_tokenized_imdb函数使用最简单的方法：基于空格进行分词。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 本函数已保存在d2lzh_tensorflow2包中方便以后使用\n",
    "def get_tokenized_imdb(data):\n",
    "    \"\"\"\n",
    "    data: list of [string, label]\n",
    "    返回: 每一条评论的单词所组成的列表\n",
    "    \"\"\"\n",
    "    def tokenizer(text):\n",
    "        #基于空格进行分词，并都转换为小写\n",
    "        return [tok.lower() for tok in text.split(' ')]\n",
    "\n",
    "    return [tokenizer(review) for review, _ in data]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = get_tokenized_imdb(train_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25000"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Counter对列表中的单词进行计数，并返回一个字典\n",
    "counter = collections.Counter([tk for st in text for tk in st])\n",
    "# vocab是一个字典，键表示单词，值表示单词出现的频率\n",
    "vocab = {w: freq for w, freq in counter.most_common() if freq > 5}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "现在，我们可以根据分好词的训练数据集来创建词典了。我们在这里过滤掉了出现次数少于5的词."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('# words in vocab:', 46152)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 本函数已保存在d2lzh_tensorflow2包中方便以后使用\n",
    "def get_vocab_imdb(data):\n",
    "    tokenized_data = get_tokenized_imdb(data)\n",
    "    #counter已经创建了一个词典,统计了每个词出现的频率\n",
    "    counter = collections.Counter([tk for st in tokenized_data for tk in st])\n",
    "    return Vocab.Vocab(counter, min_freq=5)\n",
    "    #text值保存counter中出现频率大于等于五的词\n",
    "#     text = {w: freq for w, freq in counter.most_common() if freq >= 5}\n",
    "#     vocab ={index:word for word,index in enumerate(text.keys())}\n",
    "#     return vocab\n",
    "    #return Vocab.Vocab(counter, min_freq=5)\n",
    "\n",
    "\n",
    "vocab = get_vocab_imdb(train_data)\n",
    "'# words in vocab:', len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<torchtext.vocab.Vocab at 0x1fc83d81708>"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "因为每条评论长度不一致所以不能直接组合成小批量，我们定义preprocess_imdb函数对每条评论进行分词，并通过词典转换成词索引，然后通过截断或者补“<pad>”（padding）符号来将每条评论长度固定成500。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "#在此处使用tensorflow2的填充函数进行填充\n",
    "def preprocess_imdb(data, vocab):  # 本函数已保存在d2lzh_tensorflow2包中方便以后使用\n",
    "    max_l = 500\n",
    "\n",
    "    # 将每条评论通过截断或者补0，使得长度变成500\n",
    "    def pad(x):\n",
    "        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))\n",
    "    \n",
    "    #tokenized_data为一个二维的列表,里面有我们分好的词\n",
    "    tokenized_data = get_tokenized_imdb(data)\n",
    "    \n",
    "     #将每个词转换为词索引并进行截断或补0\n",
    "    features = tf.Variable([pad([vocab[word] for word in words] ) for words in tokenized_data])\n",
    "    labels = tf.Variable([score for _, score in data])\n",
    "    return features, labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "data, label = preprocess_imdb(train_data, vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tf.Variable 'Variable:0' shape=(25000, 500) dtype=int32, numpy=\n",
       "array([[1378,   10, 5002, ...,    0,    0,    0],\n",
       "       [   2, 3047,  160, ...,    0,    0,    0],\n",
       "       [  94,    0,  620, ...,    0,    0,    0],\n",
       "       ...,\n",
       "       [  86,  347,   57, ...,    0,    0,    0],\n",
       "       [   9,  199,   11, ...,    0,    0,    0],\n",
       "       [  69,   64,  381, ...,    0,    0,    0]])>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "10.7.1.3. 创建数据迭代器\n",
    "现在，我们创建数据迭代器。每次迭代将返回一个小批量的数据。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 64\n",
    "train_set = (tf.data.Dataset.from_tensor_slices(\n",
    "    ((preprocess_imdb(train_data, vocab))))\n",
    "    .repeat()\n",
    "    .shuffle(2048)\n",
    "    .batch(batch_size)\n",
    "    .prefetch(AUTO))\n",
    "test_set = (tf.data.Dataset.from_tensor_slices(\n",
    "    ((preprocess_imdb(test_data, vocab))))\n",
    "    .shuffle(2048)\n",
    "    .batch(batch_size)\n",
    "    .prefetch(AUTO))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X (64, 500) y (64,)\n",
      "tf.Tensor(\n",
      "[[   9   86  118 ...    0    0    0]\n",
      " [  10   14    3 ...    0    0    0]\n",
      " [  10 3717   39 ...    0    0    0]\n",
      " ...\n",
      " [   9  308   45 ...    0    0    0]\n",
      " [  10   20    7 ...    0    0    0]\n",
      " [ 481    0  270 ...    0    0    0]], shape=(64, 500), dtype=int32) tf.Tensor(\n",
      "[0 0 0 1 1 1 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 0 1 0\n",
      " 1 1 1 0 0 1 1 1 0 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 1 1], shape=(64,), dtype=int32)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "('#batches:', 390)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for X, y in train_set:\n",
    "    print('X', X.shape, 'y', y.shape)\n",
    "    print(X,y)\n",
    "    break\n",
    "'#batches:', data.shape[0]//batch_size"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 10.7.2. 使用循环神经网络的模型\n",
    "在这个模型中，每个词先通过嵌入层得到特征向量。然后，我们使用双向循环神经网络对特征序列进一步编码得到序列信息。最后，我们将编码的序列信息通过全连接层变换为输出。具体来说，我们可以将双向长短期记忆在最初时间步和最终时间步的隐藏状态连结，作为特征序列的表征传递给输出层分类。在下面实现的BiRNN类中，Embedding实例即嵌入层，LSTM实例即为序列编码的隐藏层，Dense实例即生成分类结果的输出层。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "#因为tensorflow并没有像pytorch，mxnet关于glove接口的api，所以必须要重写一个\n",
    "\n",
    "def load_embedding_from_disks(glove_filename, with_indexes=True):\n",
    "    \"\"\"\n",
    "    Read a GloVe txt file. If `with_indexes=True`, we return a tuple of two dictionnaries\n",
    "    `(word_to_index_dict, index_to_embedding_array)`, otherwise we return only a direct \n",
    "    `word_to_embedding_dict` dictionnary mapping from a string to a numpy array.\n",
    "    \"\"\"\n",
    "    if with_indexes:\n",
    "        word_to_index_dict = dict()\n",
    "        index_to_embedding_array = []\n",
    "        index_to_word_dict = dict()\n",
    "        word_to_embedding = dict()\n",
    "    else:\n",
    "        word_to_embedding_dict = dict()\n",
    "\n",
    "    \n",
    "    with open(glove_filename, 'r',encoding='utf-8') as glove_file:\n",
    "        for (i, line) in enumerate(glove_file):\n",
    "            \n",
    "            split = line.split(' ')\n",
    "            \n",
    "            word = split[0]\n",
    "            \n",
    "            representation = split[1:]\n",
    "            representation = np.array(\n",
    "                [float(val) for val in representation]\n",
    "            )\n",
    "            \n",
    "            if with_indexes:\n",
    "                word_to_index_dict[word] = i\n",
    "                index_to_word_dict[i] = word\n",
    "                word_to_embedding[word] = representation\n",
    "                index_to_embedding_array.append(representation)\n",
    "            else:\n",
    "                word_to_embedding_dict[word] = representation\n",
    "\n",
    "    _WORD_NOT_FOUND = [0.0]* len(representation)  # Empty representation for unknown words.\n",
    "    if with_indexes:\n",
    "        _LAST_INDEX = i + 1\n",
    "        word_to_index_dict = defaultdict(lambda: _LAST_INDEX, word_to_index_dict)\n",
    "        index_to_embedding_array = np.array(index_to_embedding_array + [_WORD_NOT_FOUND])\n",
    "        return word_to_index_dict, index_to_embedding_array,index_to_word_dict,word_to_embedding\n",
    "    else:\n",
    "        word_to_embedding_dict = defaultdict(lambda: _WORD_NOT_FOUND)\n",
    "        return word_to_embedding_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_to_index, index_to_embedding, index_to_word,word_to_embedding = load_embedding_from_disks(\"C:/Users/HP/dive into d2l/code/chapter10_natural-language-processing/embeddings/ GloVe.6B/glove.6B.50d.txt\", with_indexes=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 本函数已保存在d2lzh_tensorflow包中方便以后使用\n",
    "def get_weights(vocab, word_to_embedding,embedding_dim,word_to_index,index_to_embedding):\n",
    "    \"\"\"从预训练好的vocab中提取出words对应的词向量\"\"\"\n",
    "    embedding_matrix = np.zeros((len(vocab), embedding_dim))\n",
    "#     embedding_matrix = np.zeros((len(vocab), embedding_dim))\n",
    "    for index, word in enumerate(vocab.itos):\n",
    "        if word in word_to_embedding.keys():\n",
    "            embedding_matrix[index] = index_to_embedding[index]\n",
    "    return embedding_matrix\n",
    "embedding_matrix = get_weights(vocab,word_to_embedding,50,word_to_index,index_to_embedding)\n",
    "# net.embedding.set_weights([embedding_matrix])\n",
    "# net.trainable = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "embed_size, num_hiddens, max_len = 50, 100, 500\n",
    "num_epochs = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = tf.keras.Sequential([\n",
    "    layers.Embedding(len(vocab), embed_size,weights=[embedding_matrix],input_length=500),\n",
    "    layers.Bidirectional(layers.LSTM(num_hiddens)),\n",
    "    tf.keras.layers.Dense(2,activation='softmax')\n",
    "])\n",
    "\n",
    "model.layers[0].trainable = False\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"sequential_2\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding_4 (Embedding)      (None, 500, 50)           2307600   \n",
      "_________________________________________________________________\n",
      "bidirectional_4 (Bidirection (None, 200)               120800    \n",
      "_________________________________________________________________\n",
      "dense_4 (Dense)              (None, 2)                 402       \n",
      "=================================================================\n",
      "Total params: 2,428,802\n",
      "Trainable params: 121,202\n",
      "Non-trainable params: 2,307,600\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(tf.keras.optimizers.Adam(0.01),\n",
    "            loss='sparse_categorical_crossentropy',\n",
    "            metrics=['sparse_categorical_accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train for 390 steps, validate for 391 steps\n",
      "Epoch 1/5\n",
      "390/390 [==============================] - 33s 86ms/step - loss: 0.6919 - sparse_categorical_accuracy: 0.5321 - val_loss: 0.6814 - val_sparse_categorical_accuracy: 0.5680\n",
      "Epoch 2/5\n",
      "390/390 [==============================] - 31s 80ms/step - loss: 0.6584 - sparse_categorical_accuracy: 0.6091 - val_loss: 0.6442 - val_sparse_categorical_accuracy: 0.6415\n",
      "Epoch 3/5\n",
      "390/390 [==============================] - 32s 82ms/step - loss: 0.6078 - sparse_categorical_accuracy: 0.6710 - val_loss: 0.6638 - val_sparse_categorical_accuracy: 0.6090\n",
      "Epoch 4/5\n",
      "390/390 [==============================] - 32s 82ms/step - loss: 0.5892 - sparse_categorical_accuracy: 0.6888 - val_loss: 0.6524 - val_sparse_categorical_accuracy: 0.6535\n",
      "Epoch 5/5\n",
      "390/390 [==============================] - 33s 84ms/step - loss: 0.5105 - sparse_categorical_accuracy: 0.7536 - val_loss: 0.5836 - val_sparse_categorical_accuracy: 0.7169\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.keras.callbacks.History at 0x1fc057bf708>"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(\n",
    "    train_set,\n",
    "    steps_per_epoch=data.shape[0]//batch_size,\n",
    "    validation_data= test_set,\n",
    "    epochs=5\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "创建一个含一个隐藏层的双向循环神经网络"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "然后，我们将用这些词向量作为评论中每个词的特征向量。注意，预训练词向量的维度需要与创建的模型中的嵌入层输出大小embed_size一致。此外，在训练中我们不再更新这些词向量。因为tensorflow并没有封装预训练词向量，所以我们要重新实现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 本函数已保存在d2lzh_pytorch包中方便以后使用\n",
    "def predict_sentiment(net, vocab, sentence):\n",
    "    \"\"\"sentence是词语的列表\"\"\"\n",
    "    sentence = tf.Variable([vocab.stoi[word] for word in sentence])\n",
    "    print(sentence)\n",
    "    print(tf.reshape(sentence,[1,-1]))\n",
    "    label = np.argmax(net(tf.reshape(sentence,[1,-1])), axis=1)\n",
    "    return 'positive' if np.array(label) == 1 else 'negative'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<tf.Variable 'Variable:0' shape=(5,) dtype=int32, numpy=array([10, 20,  7, 38, 88])>\n",
      "tf.Tensor([[10 20  7 38 88]], shape=(1, 5), dtype=int32)\n",
      "WARNING:tensorflow:Model was constructed with shape Tensor(\"embedding_4_input:0\", shape=(None, 500), dtype=float32) for input (None, 500), but it was re-called on a Tensor with incompatible shape (1, 5).\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'positive'"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predict_sentiment(model, vocab, ['this', 'movie', 'is', 'so', 'great'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<tf.Variable 'Variable:0' shape=(5,) dtype=int32, numpy=array([10, 20,  7, 38, 97])>\n",
      "WARNING:tensorflow:Model was constructed with shape Tensor(\"embedding_4_input:0\", shape=(None, 500), dtype=float32) for input (None, 500), but it was re-called on a Tensor with incompatible shape (1, 5).\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'negative'"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predict_sentiment(model, vocab, ['this', 'movie', 'is', 'so', 'bad'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
